import functools
import os
import re
import logging

from xToolkit import xfile

from checker.repo_clone import clone_repo_given_url, clone_repos_given_org_name, \
    init_repo_info_excel
from checker.utils import get_date, write_as_excel, request_get, clock
from checker.utils_str import transfer_file_path_to_raw_url, get_org_name_from_repo_url, get_repo_name_from_repo_url, \
    form_absolute_local_path, form_absolute_url_path, parse_url_with_encode_lang, \
    uniform_organization_name, get_repo_name_from_file_url, find_target_set_given_content_and_pattern, \
    is_anchor_valid_in_content, is_target_no_need_to_check, get_raw_file_url
from checker.utils_config import init_global_config_setting, init_repo_data_path_for_org, init_output_path, \
    get_access_token_from_config

# Cache of repository status, filled lazily by init_repo_status_dict:
# {organization: {repo_name: status}}
REPO_STATUS_DICT = {}
PARAMETER_DICT = init_global_config_setting()  # User-specific parameter configuration
BASE_DIR = os.path.dirname(os.path.abspath(__file__))  # Directory that contains this program
# Precompiled regular expressions used to recognise link targets.
# URL_PATTERN: an http/https URL, ending at whitespace or any of the excluded punctuation characters.
URL_PATTERN = re.compile(r'\bhttps?://[^\s\\()<>,`\[\]\*\'\"\|（）。，]+\b')
# RELATIVE_PATH_PATTERN: a Markdown link target following "](" that is neither
# a "scheme://" style URL nor a "#..." anchor, and is closed by ")".
RELATIVE_PATH_PATTERN = re.compile(r'(?<=\]\()(?!.+://.+)(?!#.+)[^\s\\()<>,`\[\]\*\'\"\|（）。，]+(?=[)])')
# ANCHOR_POINT_PATTERN: a Markdown link target following "](" that starts with "#" and is closed by ")".
ANCHOR_POINT_PATTERN = re.compile(r'(?<=\]\()#[^\s\\()<>,`\[\]\*\'\"\|（）。，]+(?=[)])')

# Log everything (DEBUG and above) to check.log next to this program.
logging.basicConfig(filename=BASE_DIR + '/check.log',
                    level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
# Alternative setups kept for reference: log to the console / silence DEBUG output.
# logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
# logging.disable(logging.DEBUG)


@clock
def check_target_link_for_single_url(file_url, output_file_name):
    """Check the target links in the remote file that a single URL points to.

    The findings are exported through ``export_to_excel``.

    :param file_url: URL of the file to check
    :param output_file_name: name of the report file to write
    :return: ``output_file_name`` as passed in
    """
    print(f'即将排查{file_url}中的目标链接')
    records = form_target_link_records_list(get_target_link_dict_for_single_url(file_url))
    print(f'所检测项：{file_url}')
    export_to_excel(output_file_name, records)
    return output_file_name


@clock
def check_target_link_for_local_file(file_name, output_file_name):
    """Check the target links in one local text file.

    The findings are exported through ``export_to_excel``.

    :param file_name: path of the file to check
    :param output_file_name: name of the report file to write
    :return: ``output_file_name`` as passed in
    """
    print(f'即将排查{file_name}中的目标链接')
    records = form_target_link_records_list(get_target_link_dict_for_local_file(file_name))
    print(f'\n所检测项：{file_name}')
    export_to_excel(output_file_name, records)
    return output_file_name


@clock
def check_target_link_for_local_repos(repo_path, output_file_name, not_clock_me=False):
    """Check the target links in every md file found under a local repository.

    :param repo_path: directory of the repository
    :param output_file_name: name of the report file to write
    :param not_clock_me: not read by this function; per the original note it lets the
        ``@clock`` decorator skip separate timing when this runs as a sub-step
    :return: ``output_file_name`` as passed in
    """
    print(f'即将排查{repo_path}下的目标链接')
    file_list = get_repo_md_path_list(repo_path)
    collected_records = []
    for idx, file_name in enumerate(file_list):
        print(f'检查第{idx + 1}个文件， 总共{len(file_list)}个文件；当前文件：{file_name}')
        collected_records += get_target_link_record_list(file_name)
    print(f'\n所检测项：{repo_path}')
    export_to_excel(output_file_name, collected_records)
    return output_file_name


@clock
def check_target_link_for_local_org_repos(organization, repo_path, output_file_name,
                                          ignore_3rd=True, ignore_docs=False, ignore_closed=True, not_clock_me=False):
    """Check the target links in the md files of an organization's local repositories.

    :param organization: organization name
    :param repo_path: directory that holds the organization's repositories
    :param output_file_name: name of the report file to write
    :param ignore_3rd: whether to skip third-party repositories
    :param ignore_docs: whether to skip the docs repository
    :param ignore_closed: whether to skip repositories that are closed
    :param not_clock_me: not read by this function; per the original note it lets the
        ``@clock`` decorator skip separate timing when this runs as a sub-step
    :return: ``output_file_name`` as passed in
    """
    print(f'正在扫描获取，{organization}组织仓库中md文件明细')
    file_list = get_repo_md_path_list(repo_path)
    collected_records = []
    for idx, file_name in enumerate(file_list):
        print(f'检查第{idx + 1}个文件， 总共{len(file_list)}个文件；当前文件：{file_name}')
        # Files of repositories excluded by the ignore_* switches contribute nothing.
        if is_file_no_need_to_check(file_name, organization,
                                    ignore_3rd=ignore_3rd, ignore_docs=ignore_docs, ignore_closed=ignore_closed):
            continue
        collected_records += get_target_link_record_list(file_name, organization)
    print(f'\n所检测项：{repo_path}')
    export_to_excel(output_file_name, collected_records)
    return output_file_name


def get_target_link_record_list(file_name, organization=None):
    """Scan one local file and return its findings as a list of record dicts.

    :param file_name: path of the local file to scan
    :param organization: optional organization name, forwarded to ``form_target_link_records``
    :return: list of record dicts (empty when nothing was found)
    """
    return form_target_link_records(get_target_link_dict_for_local_file(file_name), file_name, organization)


def form_target_link_records(target_links_dict, file_name, organization):
    """Turn the per-type findings of one file into report records.

    :param target_links_dict: mapping of link type to the links found in the file
    :param file_name: path of the file the links came from
    :param organization: organization name; when falsy, no repository name or
        file URL is attached to the records
    :return: list of record dicts, empty when ``target_links_dict`` is empty
    """
    if not target_links_dict:
        return []
    repo_name = None
    file_url = None
    if organization:
        repo_name = get_repo_name_from_file_name(file_name, organization)
        file_url = transfer_file_path_to_raw_url(file_name, organization, repo_name)
    return form_target_link_records_list(target_links_dict,
                                         repo_name=repo_name, file_name=file_name, file_url=file_url)


def is_file_no_need_to_check(file_name, organization, ignore_3rd, ignore_docs, ignore_closed):
    """Tell whether a file can be skipped because of the repository it belongs to.

    The repository name is derived from the file path, then judged by
    ``is_repo_no_need_to_check`` with the same ignore_* switches.
    """
    return is_repo_no_need_to_check(
        organization,
        get_repo_name_from_file_name(file_name, organization),
        ignore_3rd=ignore_3rd,
        ignore_docs=ignore_docs,
        ignore_closed=ignore_closed,
    )


def get_target_link_dict_for_single_url(file_url):
    """Download the file behind ``file_url`` and collect the target links in it.

    :param file_url: URL of the file; converted with ``get_raw_file_url`` before downloading
    :return: mapping of link type to the links found
    """
    raw_url = get_raw_file_url(file_url)
    md_content = get_file_content_from_url(raw_url)
    logging.debug(f'目前检查的url是：{file_url}, 将深入检查其中链接')
    return get_target_link_dict_given_content(md_content, file_url)


def get_target_link_dict_for_local_file(file_name):
    """Read a local file and collect the target links in it.

    :param file_name: path of the local file
    :return: mapping of link type to the links found
    """
    local_content = get_file_content_from_local(file_name)
    logging.debug(f'目前检查的文件是：{file_name}, 将深入检查其中链接')
    return get_target_link_dict_given_content(local_content, file_name, for_local=True)


def get_target_link_dict_given_content(content, file_id, for_local=False):
    """Extract links from ``content`` and group them by type according to the configuration.

    When ``PARAMETER_DICT['just_all_link']`` is set, every URL, relative path and
    anchor found is returned unfiltered. Otherwise each enabled switch
    (``broken_link``, ``relative_path``, ``closed_repo``, ``missed_anchor``)
    adds one entry with only the problematic links of that type.

    :param content: text to scan
    :param file_id: path or URL of the file the content came from
    :param for_local: True when ``file_id`` is a local path rather than a URL
    :return: dict mapping a link-type label to a collection of links
    """
    urls, relative_paths, anchors = (
        find_target_set_given_content_and_pattern(content, pattern)
        for pattern in (URL_PATTERN, RELATIVE_PATH_PATTERN, ANCHOR_POINT_PATTERN)
    )

    if PARAMETER_DICT['just_all_link']:
        return {'链接': urls, '相对路径': relative_paths, '锚点': anchors}

    findings = {}
    if PARAMETER_DICT['broken_link']:
        findings['断链'] = pick_broken_link_set(urls)
    if PARAMETER_DICT['relative_path']:
        findings['失效相对路径'] = pick_broken_relative_path_set(relative_paths, file_id, for_local)
    if PARAMETER_DICT['closed_repo']:
        findings['关停仓链接'] = pick_closed_repo_link_set(urls)
    # Anchors are not checked for files whose path/URL contains 'template'.
    if PARAMETER_DICT['missed_anchor'] and 'template' not in file_id:
        findings['失效锚点'] = pick_missed_anchor_set(anchors, content)
    return findings


def get_repo_name_from_file_name(file_name, organization):
    """Find which path component of ``file_name`` is a known repository of the organization.

    :param file_name: file path, with either ``\\`` or ``/`` as separator
    :param organization: organization whose repository table is consulted
    :return: the first path component that is a known repository name, or '' if none is
    """
    init_repo_status_dict(organization)
    path_parts = file_name.replace('\\', '/').split('/')
    known_repos = REPO_STATUS_DICT[organization]
    return next((part for part in path_parts if part in known_repos), '')


def init_repo_status_dict(organization):
    """Load the repository-status table of ``organization`` into ``REPO_STATUS_DICT`` once.

    The table is read from the Excel file returned by ``init_repo_info_excel``.
    Each row supplies a repository name ('仓库名称'), a status ('仓库状态') and a
    description ('仓库描述'); a description containing '停止使用' forces the
    status to '停止'.

    The cache entry is published only after the whole file has been read, so a
    failure while loading does not leave an empty table behind that later
    calls would mistake for a successfully loaded one.

    :param organization: organization name used as the cache key
    """
    if organization in REPO_STATUS_DICT:
        return
    repo_state_file = init_repo_info_excel(organization)
    repo_rows = xfile.read(repo_state_file).excel_to_dict(max=float('inf'))
    status_by_repo = {}
    for repo in repo_rows:
        repo_name = repo['仓库名称']
        repo_status = repo['仓库状态']
        repo_description = repo['仓库描述']
        # Guard against non-string cells (e.g. an empty description) before searching.
        if isinstance(repo_description, str) and '停止使用' in repo_description:
            repo_status = '停止'
        status_by_repo[repo_name] = repo_status
    REPO_STATUS_DICT[organization] = status_by_repo


def is_repo_no_need_to_check(organization, repo_name, ignore_3rd, ignore_docs, ignore_closed):
    """Tell whether a repository is excluded by any of the enabled ignore_* switches.

    :param organization: organization the repository belongs to
    :param repo_name: repository name
    :param ignore_3rd: skip repositories whose name contains 'third_party'
    :param ignore_docs: skip the repository named 'docs'
    :param ignore_closed: skip repositories reported closed by ``is_repo_closed``
    :return: True if the repository should be skipped, otherwise False
    """
    # Same evaluation order as the switches are listed; later checks run only
    # when the earlier ones did not already exclude the repository.
    return bool(
        (ignore_3rd and 'third_party' in repo_name)
        or (ignore_docs and repo_name == 'docs')
        or (ignore_closed and is_repo_closed(organization, repo_name))
    )


def is_repo_closed(organization, repo_to_check):
    """Tell whether a repository is known and has a status other than '开始'.

    :param organization: organization whose repository table is consulted
    :param repo_to_check: repository name
    :return: True for a known repository whose status is not '开始';
        False otherwise, including for unknown repositories
    """
    init_repo_status_dict(organization)
    org_repos = REPO_STATUS_DICT[organization]
    if repo_to_check not in org_repos:
        return False
    return org_repos[repo_to_check] != '开始'


def get_repo_md_path_list(repo_directory):
    """Collect the paths of all ``.md`` files under a local repository directory.

    ``.git`` directories are excluded completely: neither their own files nor
    anything in their subdirectories is visited.

    :param repo_directory: root directory to scan
    :return: list of file paths, e.g. ['E:\\repo_name\\file_name1.md', 'E:\\repo_name\\file_name2.md']
    """
    md_file_list = []
    for root, dirs, files in os.walk(repo_directory):
        if os.path.basename(root) == '.git':
            # Emptying ``dirs`` in place stops os.walk (top-down) from descending
            # into .git/objects, .git/hooks, ...; a bare ``continue`` would only
            # skip the files directly inside .git.
            dirs[:] = []
            continue
        for file in files:
            if file.endswith('.md'):
                file_path = os.path.join(root, file)
                logging.debug(f'扫描发现md文件：{file_path}')
                md_file_list.append(file_path)
    return md_file_list


def pick_broken_link_set(urls_in_content):
    """Return the subset of URLs that are judged to be dead links.

    A URL is kept when ``is_target_no_need_to_check`` does not exempt it and
    ``is_url_deadlink`` (with ``other_exceptions`` as the fallback check)
    reports it dead.

    :param urls_in_content: iterable of URLs found in a document
    :return: set of dead URLs
    """
    return {
        content_url
        for content_url in urls_in_content
        if not is_target_no_need_to_check(content_url) and is_url_deadlink(content_url, other_exceptions)
    }


def pick_broken_relative_path_set(relative_path_in_content, absolute_prefix, for_local=False):
    """Return the subset of relative paths that do not resolve.

    :param relative_path_in_content: iterable of relative paths found in a document
    :param absolute_prefix: path or URL of the document the paths are relative to
    :param for_local: True to resolve against the local file system, False to resolve as URLs
    :return: set of broken relative paths
    """
    broken = set()
    for relative_path in relative_path_in_content:
        if is_target_no_need_to_check(relative_path):
            continue
        if not is_relative_path_broken(relative_path, absolute_prefix, for_local):
            continue
        print(f'{relative_path}是失效相对路径')
        broken.add(relative_path)
    return broken


def pick_closed_repo_link_set(urls_in_content):
    """Return the subset of URLs that point into a closed repository.

    For each non-exempt URL the organization and repository name are derived
    from the URL; the URL is kept when a repository name was found, the
    organization is valid and the repository is reported closed.

    :param urls_in_content: iterable of URLs found in a document
    :return: set of URLs referencing closed repositories
    """
    closed_links = set()
    for content_url in urls_in_content:
        if is_target_no_need_to_check(content_url):
            continue
        organization = get_org_name_from_repo_url(content_url)
        repo_name = get_repo_name_from_file_url(content_url, organization)
        # Checks stay in this order so the remote organization lookup only
        # happens for URLs that yielded a repository name.
        if not repo_name:
            continue
        if not is_organization_valid(organization):
            continue
        if not is_repo_closed(organization, repo_name):
            continue
        print(f'{content_url}是关停仓的引用')
        closed_links.add(content_url)
    return closed_links


@functools.lru_cache()
def is_organization_valid(organization):
    """Ask the Gitee v5 API whether ``organization`` exists (memoized per organization).

    :param organization: organization name
    :return: False when the API answers 404, or 401 (the access token is
        rejected, so no judgement is possible); True for any other status code
    """
    access_token = get_access_token_from_config()
    response = request_get(f'https://gitee.com/api/v5/orgs/{organization}?access_token={access_token}')
    response_code = response.status_code
    if response_code == 401:
        print('access_token已失效，无法进行有效判定。')
    return response_code not in (401, 404)


def pick_missed_anchor_set(anchor_points_in_content, content):
    """Return the subset of anchors that have no valid target in ``content``.

    :param anchor_points_in_content: iterable of anchors (``#...``) found in the document
    :param content: full text of the document
    :return: set of anchors that are not exempt and not valid in the content
    """
    missed_anchors = set()
    for anchor_point in anchor_points_in_content:
        if is_target_no_need_to_check(anchor_point) or is_anchor_valid_in_content(anchor_point, content):
            continue
        print(f'{anchor_point}是失效的md锚点')
        missed_anchors.add(anchor_point)
    return missed_anchors


def get_file_content_from_local(file_path):
    """Read a local file as UTF-8 text, best effort.

    :param file_path: path of the file to read
    :return: the file's text, or '' if it could not be read (the failure is logged as a warning)
    """
    logging.debug(f'准备获取文件内容：{file_path}')
    text = ''
    try:
        with open(file_path, encoding='utf-8') as handle:
            text = handle.read()
    except Exception as err:
        # Deliberately tolerant: an unreadable file is treated as empty.
        logging.warning(f'获取文件内容{file_path}失败, 失败原因：{err}')
    return text


def get_file_content_from_url(file_url):
    """Fetch the text behind a URL, best effort.

    :param file_url: URL to request
    :return: the response text, or '' if the request failed (the failure is logged as a warning)
    """
    logging.debug(f'准备获取文件内容：{file_url}')
    text = ''
    try:
        response = request_get(file_url)
        text = response.text
    except Exception as err:
        # Deliberately tolerant: an unreachable URL is treated as empty content.
        logging.warning(f'获取文件内容{file_url}失败, 失败原因：{err}')
    return text


@functools.lru_cache()
def is_url_deadlink(url, other_exception=None):
    """Tell whether ``url`` is a dead link, i.e. answers with HTTP 404.

    Results are memoized per ``(url, other_exception)`` pair, including the
    False returned when the request itself fails.

    :param url: URL to request
    :param other_exception: optional callable taking the URL; when it returns
        a truthy value a 404 is not counted as dead
    :return: True only for a 404 that ``other_exception`` does not excuse; False otherwise
    """
    logging.debug(f'尝试判断链接：{url}是否是断链')
    try:
        response = request_get(url)
    except Exception as err:
        # A failed request is not counted as a dead link.
        logging.warning(f'尝试访问链接{url}失败, 失败原因：{err}')
        return False
    if response.status_code != 404:
        return False
    if other_exception and other_exception(url):
        return False
    print(f'{url}是404断链')
    return True


def is_path_broken(path):
    """Tell whether ``path`` points to nothing on the local file system.

    :param path: file or directory path
    :return: True if nothing exists at ``path`` (a notice is printed), otherwise False
    """
    # A path that is a regular file necessarily exists, so one existence check decides.
    if os.path.exists(path):
        return False
    print(f'{path}是无效相对路径')
    return True


def is_relative_path_broken(relative_path, absolute_prefix, for_local):
    """Resolve a relative path against its document and tell whether the result is broken.

    :param relative_path: relative path found in the document
    :param absolute_prefix: path or URL of the document
    :param for_local: True to resolve to a local path and test the file system;
        False to resolve to a URL and test it with ``is_url_deadlink``
    :return: False if the resolved target is exempt from checking, otherwise the result of the check
    """
    if not for_local:
        absolute_url = form_absolute_url_path(relative_path, absolute_prefix)
        if is_target_no_need_to_check(absolute_url):
            return False
        return is_url_deadlink(absolute_url, other_exceptions)

    resolved_path = parse_url_with_encode_lang(form_absolute_local_path(relative_path, absolute_prefix))
    if is_target_no_need_to_check(resolved_path):
        return False
    return is_path_broken(resolved_path)


def other_exceptions(url):
    """Temporary workaround for URLs that the site redirects to a reachable page.

    The redirect swaps ``tree`` and ``blob`` in the URL, so a 404 on one form
    is excused when the other form is reachable.

    Only a ``/blob/`` or ``/tree/`` path segment is swapped, and only its first
    occurrence. A bare substring replace would also rewrite repository or file
    names that merely contain "blob" or "tree", producing a URL that was never
    the redirect target.

    :param url: URL that answered 404
    :return: True if a swapped variant of the URL is not a dead link, otherwise False
    """
    for segment, swapped in (('/blob/', '/tree/'), ('/tree/', '/blob/')):
        if segment not in url:
            continue
        redirect_url = url.replace(segment, swapped, 1)
        if not is_url_deadlink(redirect_url):
            return True
    return False


def form_target_link_records_list(target_links_dict, repo_name=None, file_name=None, file_url=None):
    """Flatten a ``{link_type: links}`` mapping into a list of record dicts.

    :param target_links_dict: mapping of link type to the links of that type
    :param repo_name: optional repository name added to every record
    :param file_name: optional document name added to every record
    :param file_url: optional document URL added to every record
    :return: one record per link, in mapping order
    """
    return [
        form_target_link_record_dict(link, repo_name, file_name, file_url, link_type)
        for link_type, links in target_links_dict.items()
        for link in links
    ]


def form_target_link_record_dict(target_link, repo_name=None, file_name=None, file_url=None, link_type=None):
    """Build one report record for a target link.

    The record always holds the link itself; each optional field is added only
    when its value is truthy. A file URL yields two columns: the URL as given
    and a copy with '/raw/master/' replaced by '/blob/master/'.

    :param target_link: the link found in the document
    :param repo_name: repository name
    :param file_name: document name
    :param file_url: raw URL of the document
    :param link_type: label of the link category
    :return: dict keyed by report column name
    """
    record = {'文档中目标链': target_link}
    for column, value in (('仓库名称', repo_name), ('文档名称', file_name)):
        if value:
            record[column] = value
    if file_url:
        record.update({
            '文档链接-raw': file_url,
            '文档链接-blob': file_url.replace('/raw/master/', '/blob/master/'),
        })
    if link_type:
        record['目标链类型'] = link_type
    return record


def get_columns_order(target_link_list):
    """Decide the report column order from the keys of the first record.

    :param target_link_list: list of record dicts
    :return: the default column names that occur in the first record, in
        default order; [] when the list is empty
    """
    if not target_link_list:
        return []
    default_columns_order = ['仓库名称', '文档名称', '文档链接-raw', '文档中目标链', '文档链接-blob', '目标链类型']
    present_columns = target_link_list[0].keys()
    return [column for column in default_columns_order if column in present_columns]


def export_to_excel(output_file_name, target_link_list, columns_order=None):
    """Write the records with ``write_as_excel`` and print a short summary.

    :param output_file_name: name of the report file
    :param target_link_list: list of record dicts to write
    :param columns_order: optional column order; derived from the records when falsy
    """
    columns_order = columns_order or get_columns_order(target_link_list)
    count = write_as_excel(output_file_name, target_link_list, columns_order)
    print(f'链接数目：所检测项中，找到目标链{count}个')
    if count <= 0:
        return
    print(f'结果文件：{output_file_name}')

@clock
def clone_and_check_for_repo_url(repo_url, output_file_name, clone_to_where=None):
    """Clone the repository behind ``repo_url`` and check the target links in its md files.

    :param repo_url: URL of the repository
    :param output_file_name: name of the report file to write
    :param clone_to_where: directory to clone into; when falsy, the data path
        for the repository's organization is used
    """
    print(f'即将排查{repo_url}仓库中的目标链接')
    clone_to_where = clone_to_where or init_repo_data_path_for_org(get_org_name_from_repo_url(repo_url))
    branch_or_tag = PARAMETER_DICT['branch_or_tag']
    print(f'正在检出仓库{repo_url}的{branch_or_tag}分支')
    clone_repo_given_url(repo_url, clone_to_where, branch_or_tag=branch_or_tag)
    repo_dir = os.path.join(clone_to_where, get_repo_name_from_repo_url(repo_url))
    check_target_link_for_local_repos(repo_dir, output_file_name, not_clock_me=True)


@clock
def clone_and_check_for_organization_repos(organization, output_file_name, clone_to_where=None):
    """Clone the repositories of an organization and check the target links in their md files.

    :param organization: organization name; normalized with ``uniform_organization_name`` before use
    :param output_file_name: name of the report file to write
    :param clone_to_where: directory to clone into; when falsy, the data path
        for the organization is used
    """
    print(f'即将排查{organization}组织下各仓库中的目标链接')
    organization = uniform_organization_name(organization)
    clone_to_where = clone_to_where or init_repo_data_path_for_org(organization)
    branch_or_tag = PARAMETER_DICT['branch_or_tag']
    clone_repos_given_org_name(organization, clone_to_where, branch_or_tag=branch_or_tag)
    check_target_link_for_local_org_repos(organization, clone_to_where, output_file_name, not_clock_me=True)


# Manual demo of the five entry points; each writes a dated report into the
# output path. The paths and URLs below are hard-coded samples.
if __name__ == '__main__':
    output_path = init_output_path()
    # Mode 1: check the target links in the single file a URL points to -u
    test_url = 'https://gitee.com/openharmony-retired/drivers_adapter_uhdf/raw/master/README.md'
    signal_url_report = os.path.join(output_path, get_date() + '_md链接_目标链排查结果报告.xlsx')
    check_target_link_for_single_url(test_url, signal_url_report)

    # Mode 2: check the target links in a given local text file -f
    test_file_path = r'D:/codes/DEAD_LINK_CHECKERS\openharmony_repos/docs\en\readme\common-event-notificDation.md'
    single_file_report = os.path.join(output_path, get_date() + '_本地文件_目标链排查结果报告.xlsx')
    check_target_link_for_local_file(test_file_path, single_file_report)

    # Mode 3: check the target links in all md files of a local repository -d
    test_repo_path = r'D:/codes/DEAD_LINK_CHECKERS\openharmony-retired_repos/drivers_adapter_uhd'
    local_repo_report = os.path.join(output_path, get_date() + '_本地仓库_目标链排查结果报告.xlsx')
    check_target_link_for_local_repos(test_repo_path, local_repo_report)

    # Mode 4: given a repository URL, clone it automatically and check its target links -r
    test_repo_url = 'https://gitee.com/openharmony-retired/drivers_adapter_khdf_liteos'
    repo_url_report = os.path.join(output_path, get_date() + '_链接仓库_目标链排查结果报告.xlsx')
    clone_and_check_for_repo_url(test_repo_url, repo_url_report)

    # Mode 5: given a Gitee organization name, clone its repositories automatically and check their target links -o
    test_organization = 'openharmony'
    org_repos_report = os.path.join(output_path, get_date() + '_组织仓库_目标链排查结果报告.xlsx')
    clone_and_check_for_organization_repos(test_organization, org_repos_report)
